********************************************************************************
* match_effect_estimation.do
* Purpose: Estimate worker-firm match effects using Woodcock's (2008) model.
*
* Woodcock's decomposition separates log wages into:
*   log w_it = alpha_i + psi_j + mu_ij + X_it*beta + epsilon_it
* where mu_ij is a worker-firm match effect (complementarity component).
*
* Estimation procedure:
*   1. Identify the largest connected set of worker-firm pairs
*   2. Partial out year effects from log earnings
*   3. Control for tenure within each worker-firm match (absorb match FE via areg)
*   4. Compute mean tenure-adjusted earnings per match (mu_ij + worker FE + firm FE)
*   5. Use reghdfe to separate worker FE and firm FE from the match-mean
*   6. The match effect is the residual: mu_ij = mean_adj - worker FE - firm FE
*
* Sample restrictions mirror akm_estimation.do, except:
*   - Never-movers are NOT excluded here (match effects are identified within a match)
*   - first_year_at_firm is retained to compute tenure
*
* Input : $data/worker_firm_panel.dta
*         $data/canada_cpi.dta
*         $data/first_mna.dta
*         $data/firm_level_emp.dta
*
* Output: $data/match_effect.dta
*         Variables: casenum2019, entid_syn, match (worker-firm pair id),
*                    logearnings_met (match effect)
*         One row per worker-firm pair.
*
* Requires: reghdfe, group2hdfe, gtools (provides gegen)
********************************************************************************
* Load the worker-firm panel.
* The filename is wrapped in double quotes so that a $data directory whose
* path contains spaces is still parsed as a single filename.
use "$data/worker_firm_panel", clear

* Create a unique identifier for each worker-firm pair (match)
gegen match = group(casenum2019 entid_syn)

//------------------------------------------------------------------------------
// STEP 1: IDENTIFY MOVERS (same logic as akm_estimation.do)
//------------------------------------------------------------------------------
// NOTE(review): in the part of the file visible here, none of the variables
// created in this step (gap, unemployed, last_year_of_data, year_forw, moved,
// ever_moved) are read before the `keep` in STEP 3 discards them. The step
// appears to be retained only for parity with akm_estimation.do -- confirm
// before removing.

* Last calendar year in which each worker appears in the panel
gegen last_year_of_data = max(year), by(casenum2019)

* Years until the worker's next observation (any firm), in calendar order.
* A gap longer than one year is flagged as a spell out of the data.
gsort casenum2019 year entid_syn
by casenum2019: gen gap = year[_n+1] - year
gen unemployed = (!missing(gap) & gap > 1)

* Years until the worker's next observation at the SAME firm
gsort casenum2019 entid_syn year
by casenum2019 entid_syn: gen year_forw = year[_n+1] - year

* moved = 1 in a worker-year that is the last year at the firm, precedes a
* break of more than one year within the firm, or precedes a gap in the data;
* the worker's final year in the panel is always coded 0.
gen moved = (year != last_year_of_data) & ///
	(year == last_year_at_firm | (!missing(year_forw) & year_forw > 1) | unemployed == 1)

* Worker-level indicator: 1 if the worker is ever flagged as moving
gegen ever_moved = max(moved), by(casenum2019)


//------------------------------------------------------------------------------
// STEP 2: EARNINGS AND SAMPLE RESTRICTIONS
//------------------------------------------------------------------------------
* All `using` filenames are double-quoted so that a $data directory whose path
* contains spaces is still parsed as a single filename.

* Deflate T4 earnings: CPI is rescaled by 119.9 (the constant used here as the
* 2011 base value), then earnings are divided by the rescaled index.
merge m:1 year 				using "$data/canada_cpi", keep(1 3) nogen
gen 	CPI_base_2011 	= CPI/119.9
gen		t4earn_adjusted = t4earn/CPI_base_2011
gen 	logearnings 	= log(t4earn_adjusted)

* Bring in the firm's deal year and firm-year average employment.
* keep(1 3) retains every master row whether or not it finds a match.
merge m:1 entid_syn 		using "$data/first_mna", 	keep(1 3) keepusing(DEAL_YEAR) nogen
merge m:1 entid_syn year 	using "$data/firm_level_emp", keep(1 3) keepusing(PD7_AvgEmp_NonZero) nogen

* Sample restrictions (note: never-movers are NOT dropped here, unlike AKM).
* Stata treats missing as larger than any number, so the "<" conditions do not
* remove missing values; the explicit mi() drops are required.
drop if PD7_AvgEmp_NonZero < 5
drop if mi(PD7_AvgEmp_NonZero)
drop if t4earn_adjusted < 3900
drop if mi(t4earn_adjusted)
drop if year == DEAL_YEAR                    // exclude the deal year from the estimation sample


//------------------------------------------------------------------------------
// STEP 3: MATCH EFFECTS ESTIMATION (Woodcock 2008)
//------------------------------------------------------------------------------

* Drop workers observed only once
bys casenum2019: gen panelworker = _N
drop if panelworker == 1
drop panelworker

* Identify the largest connected set (workers and firms linked through mobility)
group2hdfe casenum2019 entid_syn , group(mygroup) largest(largest_connected_set) verbose
keep if largest_connected_set==1
drop mygroup largest_connected_set

* Keep only the variables the estimation needs.
* first_year_at_firm is retained because tenure is computed from it below.
keep casenum2019 entid_syn logearnings match year first_year_at_firm

* 3a. Remove year effects: residual of log earnings on year dummies
reg logearnings i.year
predict logearnings_tilde, resid

* 3b. Control for tenure within each match (Woodcock's tenure correction).
*   Within a worker-firm spell, earnings rise with tenure. Estimating the
*   tenure slope with match fixed effects absorbed, and then subtracting the
*   linear tenure term, leaves the level component of each match.
gen 	tenure = year - first_year_at_firm
label var tenure "Tenure in years (0,1,2...)"

* absorb(match) removes the match fixed effect; _b[tenure] captures the tenure slope
areg logearnings_tilde tenure, absorb(match)

* Tenure-adjusted earnings: remove the (estimated) linear tenure effect
gen logearnings_test = logearnings_tilde-(_b[tenure]*tenure)

* 3c. Mean tenure-adjusted earnings for each worker-firm match.
*   mean_logearnings_t = alpha_i + psi_j + mu_ij
gegen mean_logearnings_t	=	mean(logearnings_test), by(match)

* 3d. Match effect = what is left of the match mean after absorbing worker and
*   firm fixed effects, i.e. the reghdfe residual.
*   The residual is saved directly with residuals() instead of being rebuilt as
*   mean - (saved worker FE + saved firm FE). In reghdfe versions that report a
*   separate constant, the saved FEs do not include _cons, so the manual
*   subtraction would return residual + constant and shift every match effect
*   by the same amount. The saved FEs were not kept in the output file, so they
*   are no longer requested.
*   Observations that reghdfe excludes from the estimation sample (e.g.
*   singleton groups) receive a missing match effect.
* NOTE(review): tol(0.001) is far looser than reghdfe's default tolerance;
*   confirm this is an intentional speed/precision trade-off.
reghdfe mean_logearnings_t, absorb(casenum2019 entid_syn) residuals(logearnings_met) tol(0.001) verbose(1)

* Keep one row per worker-firm match (all retained variables are constant within match)
keep casenum2019 entid_syn logearnings_met match
bys match: keep if _n == 1

compress
* Filename quoted so a $data path containing spaces is handled correctly
save "$data/match_effect.dta", replace
